Analyiis of scRNA-seq data provided by Eduardo Andrés León (eduardo.andres@csic.es, Instituto de Parasitología y Biología López Neyra, Granada). SC data was generated and pre-analysed sing BD Rhapsody systems and Illumina sequencers. BD includes pipelines for read mapping and putative cell filtering. An analysis using the Seurat package (https://satijalab.org/seurat/) was done by the CSIC staff using the seven bridges platform, but an obsolete normalization method was used. This R markdown file describes a preliminary analysis for the first of the four runs the project contains.
library(Seurat)
library(SeuratData)
library(dplyr)
library(patchwork)
library(sctransform)
library(ggplot2)
Import data from RDS file provided by Eduardo Andrés León (eduardo.andres@csic.es, Instituto de Parasitología y Biología López Neyra, Granada).
run1.raw <- readRDS("../Data/run1/C1_Seurat.rds")
run1.raw
## An object of class Seurat
## 34627 features across 3809 samples within 1 assay
## Active assay: RNA (34627 features, 0 variable features)
## 2 layers present: counts, data
## 1 dimensional reduction calculated: tsne
run1.raw@meta.data[1:5, ]
## orig.ident nCount_RNA nFeature_RNA Cell_Type_Experimental
## 450 SeuratProject 5177 1952 T_CD4_memory
## 776 SeuratProject 15741 3215 T_CD8_memory
## 828 SeuratProject 4063 343 Natural_killer
## 1237 SeuratProject 16465 3414 B
## 1546 SeuratProject 8684 3104 T_CD8_memory
## Sample_Tag Sample_Name
## 450 Multiplet Multiplet
## 776 SampleTag05_hs SampleTag05_hs
## 828 SampleTag06_hs SampleTag06_hs
## 1237 Multiplet Multiplet
## 1546 SampleTag05_hs SampleTag05_hs
run1.raw[["RNA"]]$counts[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## 450 776 828 1237 1546
## A1BG . . . . .
## A1BG-AS1 . . . 1 .
## A1CF . . . . .
## A2M . . . . .
## A2M-AS1 . . . . .
head(colnames(run1.raw[["RNA"]]$counts), 10)
## [1] "450" "776" "828" "1237" "1546" "3094" "3103" "3496" "5061" "6190"
# Cells are identified by an integer, not their UMI.
run1 <- readRDS("../Data/run1/C1_Seurat_Edu.rds")
run1
## An object of class Seurat
## 34627 features across 2151 samples within 1 assay
## Active assay: RNA (34627 features, 2000 variable features)
## 3 layers present: counts, data, scale.data
## 3 dimensional reductions calculated: tsne, pca, umap
run1@meta.data[1:5, ]
## orig.ident nCount_RNA nFeature_RNA Cell_Type_Experimental
## 776 SeuratProject 15741 3215 T_CD8_memory
## 1546 SeuratProject 8684 3104 T_CD8_memory
## 3103 SeuratProject 7442 2341 T_CD8_naive
## 3496 SeuratProject 14109 3444 T_CD8_memory
## 6190 SeuratProject 4366 1618 Natural_killer
## Sample_Tag Sample_Name percent.mt RNA_snn_res.0.5 seurat_clusters
## 776 SampleTag05_hs CaPBi-23-21-S 10.488533 1 1
## 1546 SampleTag05_hs CaPBi-23-21-S 5.354675 0 0
## 3103 SampleTag05_hs CaPBi-23-21-S 10.279495 1 1
## 3496 SampleTag03_hs CaPBi-23-20-S 12.786165 1 1
## 6190 SampleTag06_hs CaPBi-23-21-T 1.351351 4 4
## PreciseCell GeneralCell Type
## 776 NK_cell T_cells N
## 1546 T_cell:CD8+ T_cells N
## 3103 T_cell:CD8+_Central_memory T_cells N
## 3496 T_cell:CD8+_Central_memory T_cells AT
## 6190 Epithelial_cells:bronchial Epithelial_cells N
run1[["RNA"]]$counts[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## 776 1546 3103 3496 6190
## A1BG . . . . .
## A1BG-AS1 . . . . .
## A1CF . . . . .
## A2M . . . . .
## A2M-AS1 . . . . .
head(colnames(run1[["RNA"]]$counts), 10)
## [1] "776" "1546" "3103" "3496" "6190" "6544" "10044" "10067" "10757"
## [10] "11599"
run1.raw[["percent.mt"]] <- PercentageFeatureSet(run1.raw, pattern = "^MT-")
run1.raw[["percent.mt"]][1:5, ]
## [1] 6.857253 10.177244 76.913611 6.401458 5.043759
minCov = 1000 #if a sample has a good coverage (>=minCov), then don't set a lower thresold for nCount, it's already pretty good.
if (min(run1.raw$nCount_RNA) >= minCov) {
countLOW = min(run1.raw$nCount_RNA)
} else {
countLOW = quantile(run1.raw$nCount_RNA, prob = c(0.01))
}
countHIGH = quantile(run1.raw$nCount_RNA, prob = 0.99)
featureHIGH = quantile(run1.raw$nFeature_RNA, prob = 0.99)
featureLOW = quantile(run1.raw$nFeature_RNA, prob = 0.01)
# subset
run1.subset <- subset(run1.raw, subset = nFeature_RNA > featureLOW & nFeature_RNA < featureHIGH &
nCount_RNA > countLOW & nCount_RNA < countHIGH & percent.mt < 25)
run1.subset
## An object of class Seurat
## 34627 features across 2613 samples within 1 assay
## Active assay: RNA (34627 features, 0 variable features)
## 2 layers present: counts, data
## 1 dimensional reduction calculated: tsne
# # Get filtering parameters count.max <- round(mean(run1.raw$nCount_RNA) + 2 *
# sd(run1.raw$nCount_RNA), digits = -2) count.min <- round(mean(run1.raw$nCount_RNA) - 2 *
# sd(run1.raw$nCount_RNA), digits = -2) feat.max <- round(mean(run1.raw$nFeature_RNA) + 2 *
# sd(run1.raw$nFeature_RNA), digits = -2) feat.min <- round(mean(run1.raw$nFeature_RNA) - 2 *
# sd(run1.raw$nFeature_RNA), digits = -2) # Set minimum parameters to 0 if negative value if
# (count.min < 0){ count.min <- 0 } else { count.min <- count.min } if (feat.min < 0){
# feat.min <- 0 } else { feat.min <- feat.min } ## Filter cells run1.subset <-
# subset(run1.raw, subset = nFeature_RNA > feat.min & nFeature_RNA < feat.max & nCount_RNA <
# count.max & nCount_RNA > count.min & percent.mt < 25) run1.subset
# Filtered using 2.3.A.: 1% top and bottom percentiles.
run1.subset <- SCTransform(run1.subset, vst.flavor = "v2", vars.to.regress = "percent.mt")
# Visualize QC metrics as a violin plot
VlnPlot(run1.subset, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# Zoom in on nCount_RNA violin plot.
VlnPlot(run1.subset, features = "nCount_RNA", ncol = 1) + ylim(0, 25000) + NoLegend()
# Visualize relationships in metadata to detect outliers with FeatureScatter function
plot1 <- FeatureScatter(run1.subset, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(run1.subset, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
run1.subset <- FindVariableFeatures(run1.subset, selection.method = "vst", nfeatures = 2000)
# Genes with differencial expression in different cells are good candidates to be biomarkers.
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(run1.subset), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(run1.subset)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
run1.subset <- FindNeighbors(run1.subset, dims = 1:20)
run1.subset <- FindClusters(run1.subset) # Default resolution = 0.8
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2613
## Number of edges: 80705
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8964
## Number of communities: 19
## Elapsed time: 0 seconds
saveRDS(run1, file = "../Ouput/Run1_raw.rds")
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2151
## Number of edges: 64853
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8949
## Number of communities: 19
## Elapsed time: 0 seconds